{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         ﻿The\n",
       "1      Project\n",
       "2    Gutenberg\n",
       "3        EBook\n",
       "4           of\n",
       "dtype: object"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Open the file `alice-in-wonderland.txt`, and read it into a `pandas` series\n",
    "# or data frame, such that each word is a separate value.\n",
    "filename = '../data/alice-in-wonderland.txt'\n",
    "\n",
    "s = Series(open(filename).read().split())\n",
    "s.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "the     732\n",
       "and     362\n",
       "a       321\n",
       "to      311\n",
       "of      300\n",
       "in      211\n",
       "she     197\n",
       "was     160\n",
       "said    129\n",
       "it      122\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# What are the 10 most common words in the book?\n",
    "s.value_counts().head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "the    792\n",
       "and    379\n",
       "a      325\n",
       "to     318\n",
       "of     313\n",
       "she    232\n",
       "in     222\n",
       "was    160\n",
       "you    141\n",
       "it     136\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Does this change if we count the words without regard to case?\n",
    "(\n",
    "    s\n",
    "    .str\n",
    "    .lower()\n",
    "    .value_counts()\n",
    "    .head(10)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "the      735\n",
       "and      384\n",
       "a        322\n",
       "to       320\n",
       "of       303\n",
       "in       214\n",
       "she      201\n",
       "was      167\n",
       "Alice    166\n",
       "it       164\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Does it change if we remove punctuation marks and quote marks\n",
    "# from the beginning and end of each word?\n",
    "import string\n",
    "\n",
    "(\n",
    "    s\n",
    "    .str\n",
    "    .strip(string.punctuation)\n",
    "    .value_counts()\n",
    "    .head(10)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1100"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many capitalized words does the book contain?\n",
    "\n",
    "(\n",
    "    s\n",
    "    .loc[s.str.contains('^[A-Z]\\w*$', \n",
    "                        regex=True)]\n",
    "    .count()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1686"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# If we ignore punctuation and quotes before the start of a word, \n",
    "# how many capitalized words does the book contain?\n",
    "\n",
    "(\n",
    "    s\n",
    "    .loc[s.str.strip(string.punctuation)\n",
    "    .str\n",
    "    .contains('^[A-Z]\\w*$', regex=True)]\n",
    "    .count()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
